Importing Libraries
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(rlang)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggthemes)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ✔ readr 2.1.4 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::%@%() masks rlang::%@%()
## ✖ gridExtra::combine() masks dplyr::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks rlang::flatten()
## ✖ purrr::flatten_chr() masks rlang::flatten_chr()
## ✖ purrr::flatten_dbl() masks rlang::flatten_dbl()
## ✖ purrr::flatten_int() masks rlang::flatten_int()
## ✖ purrr::flatten_lgl() masks rlang::flatten_lgl()
## ✖ purrr::flatten_raw() masks rlang::flatten_raw()
## ✖ purrr::invoke() masks rlang::invoke()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::splice() masks rlang::splice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(corrplot)
## corrplot 0.92 loaded
loading the Dataset
Cleaned_bitcoin_mining <- read.csv("Cleaned_bitcoin_mining.csv")
head(Cleaned_bitcoin_mining)
## Date.and.Time power.MAX..GW power.MIN..GW power.GUESS..GW
## 1 2010-07-18T00:00:00 2.67e-05 2.24e-05 2.44e-05
## 2 2010-07-19T00:00:00 2.68e-05 2.26e-05 2.46e-05
## 3 2010-07-20T00:00:00 2.72e-05 2.29e-05 2.50e-05
## 4 2010-07-21T00:00:00 2.84e-05 2.39e-05 2.61e-05
## 5 2010-07-22T00:00:00 2.82e-05 2.37e-05 2.59e-05
## 6 2010-07-23T00:00:00 2.85e-05 2.40e-05 2.61e-05
## annualised.consumption.MAX..TWh annualised.consumption.MIN..TWh
## 1 0.000233717 0.000196712
## 2 0.000235075 0.000197855
## 3 0.000238699 0.000200905
## 4 0.000249343 0.000209864
## 5 0.000247305 0.000208148
## 6 0.000250023 0.000210436
## annualised.consumption.GUESS..TWh Lower.bound.efficiency..J.Th
## 1 0.000214241 14313700
## 2 0.000215486 14313700
## 3 0.000218808 14313700
## 4 0.000228565 14313700
## 5 0.000226696 14313700
## 6 0.000229188 14313700
## Estimated.efficiency..J.Th Upper.bound.efficiency..J.Th Hydro.only..MtCO2e
## 1 14313700 14313700 4e-06
## 2 14313700 14313700 5e-06
## 3 14313700 14313700 5e-06
## 4 14313700 14313700 5e-06
## 5 14313700 14313700 5e-06
## 6 14313700 14313700 5e-06
## Estimated..MtCO2e Coal.only..MtCO2e Emission.intensity..gCO2e.kWh
## 1 0.000119 0.000214 554.1215
## 2 0.000119 0.000216 554.1215
## 3 0.000121 0.000219 554.1215
## 4 0.000127 0.000229 554.1215
## 5 0.000126 0.000227 554.1215
## 6 0.000127 0.000229 554.1215
## Hash.rate.MH.s
## 1 0.001606373
## 2 0.001822962
## 3 0.001822962
## 4 0.001750766
## 5 0.001669545
## 6 0.001669545
Checking the dimension and Structure of data
dim(Cleaned_bitcoin_mining)
## [1] 4815 15
str(Cleaned_bitcoin_mining)
## 'data.frame': 4815 obs. of 15 variables:
## $ Date.and.Time : chr "2010-07-18T00:00:00" "2010-07-19T00:00:00" "2010-07-20T00:00:00" "2010-07-21T00:00:00" ...
## $ power.MAX..GW : num 2.67e-05 2.68e-05 2.72e-05 2.84e-05 2.82e-05 2.85e-05 2.86e-05 2.99e-05 3.15e-05 3.23e-05 ...
## $ power.MIN..GW : num 2.24e-05 2.26e-05 2.29e-05 2.39e-05 2.37e-05 2.40e-05 2.41e-05 2.52e-05 2.65e-05 2.72e-05 ...
## $ power.GUESS..GW : num 2.44e-05 2.46e-05 2.50e-05 2.61e-05 2.59e-05 2.61e-05 2.62e-05 2.74e-05 2.88e-05 2.96e-05 ...
## $ annualised.consumption.MAX..TWh : num 0.000234 0.000235 0.000239 0.000249 0.000247 ...
## $ annualised.consumption.MIN..TWh : num 0.000197 0.000198 0.000201 0.00021 0.000208 ...
## $ annualised.consumption.GUESS..TWh: num 0.000214 0.000215 0.000219 0.000229 0.000227 ...
## $ Lower.bound.efficiency..J.Th : num 14313700 14313700 14313700 14313700 14313700 ...
## $ Estimated.efficiency..J.Th : num 14313700 14313700 14313700 14313700 14313700 ...
## $ Upper.bound.efficiency..J.Th : num 14313700 14313700 14313700 14313700 14313700 ...
## $ Hydro.only..MtCO2e : num 4e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 ...
## $ Estimated..MtCO2e : num 0.000119 0.000119 0.000121 0.000127 0.000126 0.000127 0.000127 0.000133 0.00014 0.000144 ...
## $ Coal.only..MtCO2e : num 0.000214 0.000216 0.000219 0.000229 0.000227 0.000229 0.00023 0.000241 0.000253 0.00026 ...
## $ Emission.intensity..gCO2e.kWh : num 554 554 554 554 554 ...
## $ Hash.rate.MH.s : num 0.00161 0.00182 0.00182 0.00175 0.00167 ...
Our Dataset contains 4,815 observations(rows) and 15
variables(columns). The structure of the bitcoin mining dataset reveals
information related to power consumption, efficiency, CO2 emissions, and
hash rates.
Summary Statistics
summary(Cleaned_bitcoin_mining)
## Date.and.Time power.MAX..GW power.MIN..GW power.GUESS..GW
## Length:4815 Min. : 0.00003 Min. :0.000022 Min. : 0.000024
## Class :character 1st Qu.: 0.39179 1st Qu.:0.031152 1st Qu.: 0.154086
## Mode :character Median : 2.12457 Median :0.384142 Median : 0.905217
## Mean : 9.82974 Mean :2.039373 Mean : 3.989582
## 3rd Qu.:15.41883 3rd Qu.:4.049493 3rd Qu.: 7.710647
## Max. :56.01570 Max. :8.947454 Max. :15.063222
## annualised.consumption.MAX..TWh annualised.consumption.MIN..TWh
## Min. : 0.0002 Min. : 0.0002
## 1st Qu.: 3.4344 1st Qu.: 0.2731
## Median : 18.6240 Median : 3.3674
## Mean : 86.1675 Mean :17.8771
## 3rd Qu.:135.1615 3rd Qu.:35.4978
## Max. :491.0337 Max. :78.4334
## annualised.consumption.GUESS..TWh Lower.bound.efficiency..J.Th
## Min. : 0.00021 Min. : 21
## 1st Qu.: 1.35072 1st Qu.: 38
## Median : 7.93513 Median : 98
## Mean : 34.97267 Mean : 458086
## 3rd Qu.: 67.59153 3rd Qu.: 9917
## Max. :132.04420 Max. :14313700
## Estimated.efficiency..J.Th Upper.bound.efficiency..J.Th Hydro.only..MtCO2e
## Min. : 31 Min. : 46 Min. :0.000004
## 1st Qu.: 68 1st Qu.: 167 1st Qu.:0.028365
## Median : 261 Median : 766 Median :0.166638
## Mean : 771891 Mean : 1292594 Mean :0.734426
## 3rd Qu.: 36553 3rd Qu.: 75000 3rd Qu.:1.419422
## Max. :14313700 Max. :14313700 Max. :2.772928
## Estimated..MtCO2e Coal.only..MtCO2e Emission.intensity..gCO2e.kWh
## Min. : 0.00012 Min. : 0.00021 Min. :359.5
## 1st Qu.: 0.75628 1st Qu.: 1.35207 1st Qu.:512.8
## Median : 4.22858 Median : 7.94307 Median :533.7
## Mean :17.95686 Mean : 35.00765 Mean :532.2
## 3rd Qu.:31.96006 3rd Qu.: 67.65912 3rd Qu.:559.0
## Max. :66.90830 Max. :132.17625 Max. :594.6
## Hash.rate.MH.s
## Min. : 0
## 1st Qu.: 3838
## Median : 3210303
## Mean : 64397862
## 3rd Qu.:111495251
## Max. :506061817
From the summary statistics, we can see the distribution and range
of each variable. No NA counts are reported for any column, which suggests
that there are no missing values; this is verified in the next step.
Data cleaning
Checking for missing values
sum(is.na(Cleaned_bitcoin_mining))
## [1] 0
There are no missing (NA) values, as this is the cleaned dataset: every
column has complete data for all rows.
Checking number of Unique values
# Count the distinct values in every column. vapply() pins the result to one
# integer per column, whereas sapply() may silently change its return type.
vapply(Cleaned_bitcoin_mining, function(x) length(unique(x)), integer(1))
## Date.and.Time power.MAX..GW
## 4815 4767
## power.MIN..GW power.GUESS..GW
## 4745 4771
## annualised.consumption.MAX..TWh annualised.consumption.MIN..TWh
## 4771 4750
## annualised.consumption.GUESS..TWh Lower.bound.efficiency..J.Th
## 4774 24
## Estimated.efficiency..J.Th Upper.bound.efficiency..J.Th
## 275 44
## Hydro.only..MtCO2e Estimated..MtCO2e
## 4543 4757
## Coal.only..MtCO2e Emission.intensity..gCO2e.kWh
## 4761 39
## Hash.rate.MH.s
## 3801
Date and time has 4815 unique values, which means that each row
corresponds to a unique timestamp. Most of the columns have a large
number of unique values, suggesting continuous data, but a few columns such as
“Lower bound efficiency, J/Th”, “Upper bound efficiency, J/Th”, and
“Emission intensity, gCO2e/kWh” have far fewer distinct values, indicating potential
categories or repeated measurements.
Changing the “Date and Time” data type
# Parse the ISO-8601 style timestamps ("YYYY-MM-DDTHH:MM:SS") into POSIXct.
# The time zone is pinned explicitly so the result does not depend on the
# machine's local zone, and so that midnights which do not exist in some local
# zones (DST changes at 00:00) cannot turn into NA.
# NOTE(review): assumes the source timestamps are UTC -- confirm with the data
# provider.
Cleaned_bitcoin_mining$Date.and.Time <- as.POSIXct(
  Cleaned_bitcoin_mining$Date.and.Time,
  format = "%Y-%m-%dT%H:%M:%S",
  tz = "UTC"
)
str(Cleaned_bitcoin_mining)
## 'data.frame': 4815 obs. of 15 variables:
## $ Date.and.Time : POSIXct, format: "2010-07-18" "2010-07-19" ...
## $ power.MAX..GW : num 2.67e-05 2.68e-05 2.72e-05 2.84e-05 2.82e-05 2.85e-05 2.86e-05 2.99e-05 3.15e-05 3.23e-05 ...
## $ power.MIN..GW : num 2.24e-05 2.26e-05 2.29e-05 2.39e-05 2.37e-05 2.40e-05 2.41e-05 2.52e-05 2.65e-05 2.72e-05 ...
## $ power.GUESS..GW : num 2.44e-05 2.46e-05 2.50e-05 2.61e-05 2.59e-05 2.61e-05 2.62e-05 2.74e-05 2.88e-05 2.96e-05 ...
## $ annualised.consumption.MAX..TWh : num 0.000234 0.000235 0.000239 0.000249 0.000247 ...
## $ annualised.consumption.MIN..TWh : num 0.000197 0.000198 0.000201 0.00021 0.000208 ...
## $ annualised.consumption.GUESS..TWh: num 0.000214 0.000215 0.000219 0.000229 0.000227 ...
## $ Lower.bound.efficiency..J.Th : num 14313700 14313700 14313700 14313700 14313700 ...
## $ Estimated.efficiency..J.Th : num 14313700 14313700 14313700 14313700 14313700 ...
## $ Upper.bound.efficiency..J.Th : num 14313700 14313700 14313700 14313700 14313700 ...
## $ Hydro.only..MtCO2e : num 4e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 ...
## $ Estimated..MtCO2e : num 0.000119 0.000119 0.000121 0.000127 0.000126 0.000127 0.000127 0.000133 0.00014 0.000144 ...
## $ Coal.only..MtCO2e : num 0.000214 0.000216 0.000219 0.000229 0.000227 0.000229 0.00023 0.000241 0.000253 0.00026 ...
## $ Emission.intensity..gCO2e.kWh : num 554 554 554 554 554 ...
## $ Hash.rate.MH.s : num 0.00161 0.00182 0.00182 0.00175 0.00167 ...
class(Cleaned_bitcoin_mining$Date.and.Time)
## [1] "POSIXct" "POSIXt"
date_range <- range(Cleaned_bitcoin_mining$Date.and.Time)
date_range
## [1] "2010-07-18 EDT" "2023-09-22 EDT"
We convert the Date and Time column to POSIXct because many
plotting functions understand POSIXct/POSIXt and will correctly
format axes and labels when plotting datetime values; it is also better
suited to date-based data manipulations and operations.
Univariate Analysis
# Columns analysed throughout the univariate section.
variables <- c(
  "power.GUESS..GW", "annualised.consumption.GUESS..TWh",
  "Estimated.efficiency..J.Th", "Hydro.only..MtCO2e", "Estimated..MtCO2e",
  "Coal.only..MtCO2e", "Emission.intensity..gCO2e.kWh", "Hash.rate.MH.s"
)
# Improve variable names for display (same order as `variables`)
var_names <- c(
  "Power (GW)", "Annualised Consumption (TWh)", "Estimated Efficiency (J/Th)",
  "Hydro Only Emissions (MtCO2e)", "Estimated Emissions (MtCO2e)",
  "Coal Only Emissions (MtCO2e)", "Emission Intensity (gCO2e/kWh)",
  "Hash Rate (MH/s)"
)
# Convert data to long format for facetting
df_long <- Cleaned_bitcoin_mining %>%
  select(all_of(variables)) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")
# Relabel the facets with the display names, keeping the order of `variables`.
df_long$Variable <- factor(df_long$Variable, levels = variables, labels = var_names)
# Plot: one histogram per variable with a frequency polygon on top.
# after_stat(count) and linewidth replace `..count..` and `size`, which were
# deprecated in ggplot2 3.4.0. geom_freqpoly() gets the same explicit bin count
# as the histogram, so both layers share bins and no stat_bin() message is
# emitted.
p <- ggplot(df_long, aes(x = Value)) +
  geom_histogram(
    aes(y = after_stat(count)),
    fill = "#66c2a5", color = "#004d40", bins = 30
  ) +
  geom_freqpoly(color = "#e34a33", linewidth = 1, bins = 30) +
  facet_wrap(~ Variable, scales = "free", ncol = 2) +
  theme_minimal() +
  labs(title = "Histograms of Selected Variables", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Draw one stand-alone histogram (plus frequency polygon) per variable.
# seq_along() is used instead of 1:length() so an empty `variables` vector
# yields zero iterations rather than iterating over c(1, 0).
for (i in seq_along(variables)) {
  # Subset data for the variable
  df_subset <- df_long[df_long$Variable == var_names[i], ]
  # after_stat(count) / linewidth replace the `..count..` / `size` forms that
  # were deprecated in ggplot2 3.4.0; the polygon uses the histogram's bins.
  p <- ggplot(df_subset, aes(x = Value)) +
    geom_histogram(
      aes(y = after_stat(count)),
      fill = "#66c2a5", color = "#004d40", bins = 30
    ) +
    geom_freqpoly(color = "#e34a33", linewidth = 1, bins = 30) +
    labs(title = paste("Histogram of", var_names[i]), y = "Frequency") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(p)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Outliers
Boxplots to visualize outliers
# Draw one box plot per analysed variable, with outliers highlighted in red.
for (i in seq_along(variables)) {
  # Map the column through the `.data` pronoun instead of indexing the data
  # frame inside aes(); ggplot2 warns that the latter is discouraged.
  p <- ggplot(Cleaned_bitcoin_mining, aes(y = .data[[variables[i]]])) +
    geom_boxplot(
      fill = "#66c2a5", color = "#004d40",
      outlier.color = "red", outlier.size = 2
    ) +
    labs(title = paste("Box Plot of", var_names[i]), y = var_names[i]) +
    theme_minimal()
  print(p)
}
## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

IQR
variables <- c(
  "power.GUESS..GW", "annualised.consumption.GUESS..TWh",
  "Estimated.efficiency..J.Th", "Hydro.only..MtCO2e", "Estimated..MtCO2e",
  "Coal.only..MtCO2e", "Emission.intensity..gCO2e.kWh", "Hash.rate.MH.s"
)
# Count, for each variable, the observations lying outside the fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]. vapply() guarantees one integer per
# variable and names the result after `variables`.
outliers_counts <- vapply(variables, function(var) {
  values <- Cleaned_bitcoin_mining[[var]]
  quartiles <- quantile(values, c(0.25, 0.75), names = FALSE)
  # Named `iqr_width` rather than `IQR` to avoid shadowing stats::IQR().
  iqr_width <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - 1.5 * iqr_width
  upper_bound <- quartiles[2] + 1.5 * iqr_width
  sum(values < lower_bound | values > upper_bound)
}, integer(1))
outliers_counts
## power.GUESS..GW annualised.consumption.GUESS..TWh
## 0 0
## Estimated.efficiency..J.Th Hydro.only..MtCO2e
## 1097 0
## Estimated..MtCO2e Coal.only..MtCO2e
## 0 0
## Emission.intensity..gCO2e.kWh Hash.rate.MH.s
## 214 254
Cap/Floor Outliers
# Work on a copy so the original data stay available for later comparisons.
Cleaned_bitcoin_mining_copy <- Cleaned_bitcoin_mining
# Clamp every analysed variable to its fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]: values below the lower fence are raised to
# it, values above the upper fence are lowered to it.
for (var in variables) {
  values <- Cleaned_bitcoin_mining_copy[[var]]
  quartiles <- quantile(values, c(0.25, 0.75), names = FALSE)
  # Named `iqr_width` rather than `IQR` to avoid shadowing stats::IQR().
  iqr_width <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - 1.5 * iqr_width
  upper_bound <- quartiles[2] + 1.5 * iqr_width
  # pmax()/pmin() express the clamp directly instead of nested ifelse() calls.
  Cleaned_bitcoin_mining_copy[[var]] <- pmin(pmax(values, lower_bound), upper_bound)
}
summary(Cleaned_bitcoin_mining_copy[variables])
## power.GUESS..GW annualised.consumption.GUESS..TWh
## Min. : 0.000024 Min. : 0.00021
## 1st Qu.: 0.154086 1st Qu.: 1.35072
## Median : 0.905217 Median : 7.93513
## Mean : 3.989582 Mean : 34.97267
## 3rd Qu.: 7.710647 3rd Qu.: 67.59153
## Max. :15.063222 Max. :132.04420
## Estimated.efficiency..J.Th Hydro.only..MtCO2e Estimated..MtCO2e
## Min. : 31.13 Min. :0.000004 Min. : 0.00012
## 1st Qu.: 67.72 1st Qu.:0.028365 1st Qu.: 0.75628
## Median : 260.92 Median :0.166638 Median : 4.22858
## Mean :23180.17 Mean :0.734426 Mean :17.95686
## 3rd Qu.:36553.00 3rd Qu.:1.419422 3rd Qu.:31.96006
## Max. :91280.91 Max. :2.772928 Max. :66.90830
## Coal.only..MtCO2e Emission.intensity..gCO2e.kWh Hash.rate.MH.s
## Min. : 0.00021 Min. :443.5 Min. : 0
## 1st Qu.: 1.35207 1st Qu.:512.8 1st Qu.: 3838
## Median : 7.94307 Median :533.7 Median : 3210303
## Mean : 35.00765 Mean :534.2 Mean : 60413666
## 3rd Qu.: 67.65912 3rd Qu.:559.0 3rd Qu.:111495251
## Max. :132.17625 Max. :594.6 Max. :278732371
# Histogram (plus frequency polygon) of every variable after capping/flooring.
for (var in variables) {
  # `.data[[var]]` replaces aes_string(), deprecated since ggplot2 3.0.0;
  # after_stat(count) / linewidth replace `..count..` / `size` (deprecated in
  # ggplot2 3.4.0). The x label is set explicitly so the axis still shows the
  # column name.
  p <- ggplot(Cleaned_bitcoin_mining_copy, aes(x = .data[[var]])) +
    geom_histogram(
      aes(y = after_stat(count)),
      fill = "#66c2a5", color = "#004d40", bins = 30
    ) +
    geom_freqpoly(color = "#e34a33", linewidth = 1, bins = 30) +
    labs(
      title = paste("Histogram of", var, "after Capping/Flooring"),
      x = var,
      y = "Frequency"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(p)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Apply log transformation
# Add a log1p()-transformed copy of every analysed variable as `log_<name>`.
# log1p(x) = log(1 + x), so zero values stay finite.
Cleaned_bitcoin_mining_log <- Cleaned_bitcoin_mining
for (var in variables) {
  Cleaned_bitcoin_mining_log[[paste0("log_", var)]] <- log1p(Cleaned_bitcoin_mining[[var]])
}
# Visualize the log-transformed data
for (var in paste0("log_", variables)) {
  # Plot. `.data[[var]]` replaces the deprecated aes_string(); after_stat(count)
  # and linewidth replace `..count..` and `size` (deprecated in ggplot2 3.4.0).
  # The x label is set explicitly so the axis still shows the column name.
  p <- ggplot(Cleaned_bitcoin_mining_log, aes(x = .data[[var]])) +
    geom_histogram(
      aes(y = after_stat(count)),
      fill = "#66c2a5", color = "#004d40", bins = 30
    ) +
    geom_freqpoly(color = "#e34a33", linewidth = 1, bins = 30) +
    labs(
      title = paste("Histogram of Log Transformed", var),
      x = var,
      y = "Frequency"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(p)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Segmentation Analysis
This is a method used to divide a data set into subsets (here: the
original data with outliers, and the same data with outliers removed).
# Build the two segments: the full data, and the rows that lie inside the
# 1.5 * IQR fences of every analysed variable. The fences are always computed
# from the full data set, so a single accumulated keep-mask selects the same
# rows as filtering one variable after another.
inside_fences <- rep(TRUE, nrow(Cleaned_bitcoin_mining))
for (column in variables) {
  column_values <- Cleaned_bitcoin_mining[[column]]
  first_quartile <- quantile(column_values, 0.25)
  third_quartile <- quantile(column_values, 0.75)
  fence_width <- 1.5 * (third_quartile - first_quartile)
  inside_fences <- inside_fences &
    column_values >= first_quartile - fence_width &
    column_values <= third_quartile + fence_width
}
data_without_outliers <- Cleaned_bitcoin_mining[inside_fences, ]
data_with_outliers <- Cleaned_bitcoin_mining
# Summaries of both segments, shown together for comparison.
summary_with_outliers <- summary(data_with_outliers[variables])
summary_without_outliers <- summary(data_without_outliers[variables])
list(
  Without_Outliers = summary_without_outliers,
  With_Outliers = summary_with_outliers
)
## $Without_Outliers
## power.GUESS..GW annualised.consumption.GUESS..TWh
## Min. : 0.000478 Min. : 0.00419
## 1st Qu.: 0.539391 1st Qu.: 4.72830
## Median : 3.611244 Median : 31.65617
## Mean : 4.398428 Mean : 38.55662
## 3rd Qu.: 8.442700 3rd Qu.: 74.00870
## Max. :13.266792 Max. :116.29670
## Estimated.efficiency..J.Th Hydro.only..MtCO2e Estimated..MtCO2e
## Min. : 33.43 Min. :0.000088 Min. : 0.00236
## 1st Qu.: 68.34 1st Qu.:0.099294 1st Qu.: 2.57071
## Median : 182.88 Median :0.664779 Median :16.67565
## Mean : 3524.93 Mean :0.809689 Mean :20.32547
## 3rd Qu.: 850.32 3rd Qu.:1.554183 3rd Qu.:38.74943
## Max. :58750.00 Max. :2.442231 Max. :64.73054
## Coal.only..MtCO2e Emission.intensity..gCO2e.kWh Hash.rate.MH.s
## Min. : 0.0042 Min. :462.5 Min. : 7
## 1st Qu.: 4.7330 1st Qu.:512.9 1st Qu.: 450189
## Median : 31.6878 Median :533.7 Median : 15023580
## Mean : 38.5952 Mean :533.9 Mean : 60112171
## 3rd Qu.: 74.0827 3rd Qu.:554.5 3rd Qu.:112650400
## Max. :116.4130 Max. :594.6 Max. :277924882
##
## $With_Outliers
## power.GUESS..GW annualised.consumption.GUESS..TWh
## Min. : 0.000024 Min. : 0.00021
## 1st Qu.: 0.154086 1st Qu.: 1.35072
## Median : 0.905217 Median : 7.93513
## Mean : 3.989582 Mean : 34.97267
## 3rd Qu.: 7.710647 3rd Qu.: 67.59153
## Max. :15.063222 Max. :132.04420
## Estimated.efficiency..J.Th Hydro.only..MtCO2e Estimated..MtCO2e
## Min. : 31 Min. :0.000004 Min. : 0.00012
## 1st Qu.: 68 1st Qu.:0.028365 1st Qu.: 0.75628
## Median : 261 Median :0.166638 Median : 4.22858
## Mean : 771891 Mean :0.734426 Mean :17.95686
## 3rd Qu.: 36553 3rd Qu.:1.419422 3rd Qu.:31.96006
## Max. :14313700 Max. :2.772928 Max. :66.90830
## Coal.only..MtCO2e Emission.intensity..gCO2e.kWh Hash.rate.MH.s
## Min. : 0.00021 Min. :359.5 Min. : 0
## 1st Qu.: 1.35207 1st Qu.:512.8 1st Qu.: 3838
## Median : 7.94307 Median :533.7 Median : 3210303
## Mean : 35.00765 Mean :532.2 Mean : 64397862
## 3rd Qu.: 67.65912 3rd Qu.:559.0 3rd Qu.:111495251
## Max. :132.17625 Max. :594.6 Max. :506061817
Bivariate Analysis
Correlation Matrix
# Pairwise correlations between the analysed variables (rows containing NA are
# dropped), drawn as the upper triangle of a colour-coded matrix.
cor_matrix <- Cleaned_bitcoin_mining[variables] %>%
  cor(use = "complete.obs")
# Diverging red -> white -> blue palette generator.
col <- colorRampPalette(
  c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA")
)
corrplot(
  corr = cor_matrix,
  method = "color",
  type = "upper",
  order = "hclust",      # group similar variables next to each other
  col = col(200),
  addCoef.col = "black", # print the coefficient in every cell
  number.cex = 0.5,
  tl.col = "black",
  tl.srt = 90,
  title = "Correlation Matrix",
  mar = c(0, 0, 1, 0)    # leave room at the top for the title
)

Highly Correlated Variables :
Two-sample t-test comparing power.GUESS..GW before and after
January 3rd, 2013 (the cutoff date used in the code below)
# Split the observations at 2013-01-03 and compare mean estimated power between
# the two periods with a Welch two-sample t-test.
# The cutoff is built as POSIXct in the column's own time zone: comparing a
# POSIXct column directly with a Date object mixes two classes whose
# comparison methods conflict, so the result is not reliable.
split_cutoff <- as.POSIXct(
  "2013-01-03",
  tz = lubridate::tz(Cleaned_bitcoin_mining$Date.and.Time)
)
before_2013 <- subset(Cleaned_bitcoin_mining, Date.and.Time < split_cutoff)
after_2013 <- subset(Cleaned_bitcoin_mining, Date.and.Time >= split_cutoff)
t_result <- t.test(before_2013$power.GUESS..GW, after_2013$power.GUESS..GW)
print(t_result)
##
## Welch Two Sample t-test
##
## data: before_2013$power.GUESS..GW and after_2013$power.GUESS..GW
## t = -66.997, df = 3914.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.036382 -4.749997
## sample estimates:
## mean of x mean of y
## 0.01100724 4.90419668
T-test for Selected Variables
# Run a Welch two-sample t-test (before vs. after the cutoff) for every
# analysed variable. A variable is skipped, with an explanatory string stored
# instead of a test result, when either period contains NA or is constant.
results <- setNames(
  lapply(variables, function(var) {
    pre_period <- before_2013[[var]]
    post_period <- after_2013[[var]]
    if (anyNA(pre_period) || anyNA(post_period)) {
      return("Contains NA values")
    }
    if (length(unique(pre_period)) == 1 || length(unique(post_period)) == 1) {
      return("Constant values in one or both periods")
    }
    # Called on the indexing expressions themselves so the "data:" line of the
    # printed result stays the same.
    t.test(before_2013[[var]], after_2013[[var]])
  }),
  variables
)
# Print every result under its own header, separated by a rule.
invisible(lapply(variables, function(var) {
  cat("T-test results for", var, ":\n")
  print(results[[var]])
  cat("\n---------------------------------------------\n")
}))
## T-test results for power.GUESS..GW :
##
## Welch Two Sample t-test
##
## data: before_2013[[var]] and after_2013[[var]]
## t = -66.997, df = 3914.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.036382 -4.749997
## sample estimates:
## mean of x mean of y
## 0.01100724 4.90419668
##
##
## ---------------------------------------------
## T-test results for annualised.consumption.GUESS..TWh :
##
## Welch Two Sample t-test
##
## data: before_2013[[var]] and after_2013[[var]]
## t = -66.997, df = 3914.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -44.14892 -41.63848
## sample estimates:
## mean of x mean of y
## 0.09648949 42.99018811
##
##
## ---------------------------------------------
## T-test results for Estimated.efficiency..J.Th :
##
## Welch Two Sample t-test
##
## data: before_2013[[var]] and after_2013[[var]]
## t = 25.546, df = 899.31, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 3659260 4268296
## sample estimates:
## mean of x mean of y
## 3994776.19 30998.28
##
##
## ---------------------------------------------
## T-test results for Hydro.only..MtCO2e :
##
## Welch Two Sample t-test
##
## data: before_2013[[var]] and after_2013[[var]]
## t = -66.997, df = 3914.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.9271274 -0.8744080
## sample estimates:
## mean of x mean of y
## 0.002026276 0.902793953
##
##
## ---------------------------------------------
## T-test results for Estimated..MtCO2e :
##
## Welch Two Sample t-test
##
## data: before_2013[[var]] and after_2013[[var]]
## t = -66.447, df = 3914.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -22.66799 -21.36865
## sample estimates:
## mean of x mean of y
## 0.05411622 22.07243658
##
##
## ---------------------------------------------
## T-test results for Coal.only..MtCO2e :
##
## Welch Two Sample t-test
##
## data: before_2013[[var]] and after_2013[[var]]
## t = -66.997, df = 3914.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -44.19307 -41.68011
## sample estimates:
## mean of x mean of y
## 0.09658598 43.03317831
##
##
## ---------------------------------------------
## T-test results for Emission.intensity..gCO2e.kWh :
##
## Welch Two Sample t-test
##
## data: before_2013[[var]] and after_2013[[var]]
## t = 53.295, df = 4099.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 32.17905 34.63700
## sample estimates:
## mean of x mean of y
## 559.4096 526.0015
##
##
## ---------------------------------------------
## T-test results for Hash.rate.MH.s :
##
## Welch Two Sample t-test
##
## data: before_2013[[var]] and after_2013[[var]]
## t = -47.902, df = 3914, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -82443618 -75960296
## sample estimates:
## mean of x mean of y
## 9.051497e+00 7.920197e+07
##
##
## ---------------------------------------------